This example is from Stock Market Data Analysis with Python
Also check this Stock Market Prediction in Python
In [1]:
import pandas as pd
# from pandas.io import data, wb # Package and modules for importing data; this code may change depending on pandas version
import datetime
from pandas_datareader import data, wb
In [2]:
# We will look at stock prices over the past year, starting at January 1, 2016
start = datetime.datetime(2016,1,1)
end = datetime.date.today()
In [3]:
# Let's get Apple stock data; Apple's ticker symbol is AAPL
# First argument is the series we want, second is the source ("yahoo" for Yahoo! Finance), third is the start date, fourth is the end date
apple = data.DataReader("AAPL", "yahoo", start, end)
type(apple)
Out[3]:
In [4]:
apple.head()
Out[4]:
In [5]:
import matplotlib.pyplot as plt # Import matplotlib
# This line is necessary for the plot to appear in a Jupyter notebook
%matplotlib inline
# Control the default size of figures in this Jupyter notebook
%pylab inline
In [6]:
pylab.rcParams['figure.figsize'] = (15, 9) # Change the size of plots
apple["Adj Close"].plot(grid = True) # Plot the adjusted closing price of AAPL
Out[6]:
In [7]:
from matplotlib.dates import DateFormatter, WeekdayLocator,\
DayLocator, MONDAY
from matplotlib.finance import candlestick_ohlc
def pandas_candlestick_ohlc(dat, stick = "day", otherseries = None):
"""
:param dat: pandas DataFrame object with datetime64 index, and float columns "Open", "High", "Low", and "Close", likely created via DataReader from "yahoo"
:param stick: A string or number indicating the period of time covered by a single candlestick. Valid string inputs include "day", "week", "month", and "year", ("day" default), and any numeric input indicates the number of trading days included in a period
:param otherseries: An iterable that will be coerced into a list, containing the columns of dat that hold other series to be plotted as lines
This will show a Japanese candlestick plot for stock data stored in dat, also plotting other series if passed.
"""
mondays = WeekdayLocator(MONDAY) # major ticks on the mondays
alldays = DayLocator() # minor ticks on the days
dayFormatter = DateFormatter('%d') # e.g., 12
# Create a new DataFrame which includes OHLC data for each period specified by stick input
transdat = dat.loc[:,["Open", "High", "Low", "Close"]]
if (type(stick) == str):
if stick == "day":
plotdat = transdat
stick = 1 # Used for plotting
elif stick in ["week", "month", "year"]:
if stick == "week":
transdat["week"] = pd.to_datetime(transdat.index).map(lambda x: x.isocalendar()[1]) # Identify weeks
elif stick == "month":
transdat["month"] = pd.to_datetime(transdat.index).map(lambda x: x.month) # Identify months
transdat["year"] = pd.to_datetime(transdat.index).map(lambda x: x.isocalendar()[0]) # Identify years
grouped = transdat.groupby(list(set(["year",stick]))) # Group by year and other appropriate variable
plotdat = pd.DataFrame({"Open": [], "High": [], "Low": [], "Close": []}) # Create empty data frame containing what will be plotted
for name, group in grouped:
plotdat = plotdat.append(pd.DataFrame({"Open": group.iloc[0,0],
"High": max(group.High),
"Low": min(group.Low),
"Close": group.iloc[-1,3]},
index = [group.index[0]]))
if stick == "week": stick = 5
elif stick == "month": stick = 30
elif stick == "year": stick = 365
elif (type(stick) == int and stick >= 1):
transdat["stick"] = [np.floor(i / stick) for i in range(len(transdat.index))]
grouped = transdat.groupby("stick")
plotdat = pd.DataFrame({"Open": [], "High": [], "Low": [], "Close": []}) # Create empty data frame containing what will be plotted
for name, group in grouped:
plotdat = plotdat.append(pd.DataFrame({"Open": group.iloc[0,0],
"High": max(group.High),
"Low": min(group.Low),
"Close": group.iloc[-1,3]},
index = [group.index[0]]))
else:
raise ValueError('Valid inputs to argument "stick" include the strings "day", "week", "month", "year", or a positive integer')
# Set plot parameters, including the axis object ax used for plotting
fig, ax = plt.subplots()
fig.subplots_adjust(bottom=0.2)
if plotdat.index[-1] - plotdat.index[0] < pd.Timedelta('730 days'):
weekFormatter = DateFormatter('%b %d') # e.g., Jan 12
ax.xaxis.set_major_locator(mondays)
ax.xaxis.set_minor_locator(alldays)
else:
weekFormatter = DateFormatter('%b %d, %Y')
ax.xaxis.set_major_formatter(weekFormatter)
ax.grid(True)
# Create the candelstick chart
candlestick_ohlc(ax, list(zip(list(date2num(plotdat.index.tolist())), plotdat["Open"].tolist(), plotdat["High"].tolist(),
plotdat["Low"].tolist(), plotdat["Close"].tolist())),
colorup = "black", colordown = "red", width = stick * .4)
# Plot other series (such as moving averages) as lines
if otherseries != None:
if type(otherseries) != list:
otherseries = [otherseries]
dat.loc[:,otherseries].plot(ax = ax, lw = 1.3, grid = True)
ax.xaxis_date()
ax.autoscale_view()
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()
pandas_candlestick_ohlc(apple)
In [8]:
microsoft = data.DataReader("MSFT", "yahoo", start, end)
google = data.DataReader("GOOG", "yahoo", start, end)
# Below I create a DataFrame consisting of the adjusted closing price of these stocks, first by making a list of these objects and using the join method
stocks = pd.DataFrame({"AAPL": apple["Adj Close"],
"MSFT": microsoft["Adj Close"],
"GOOG": google["Adj Close"]})
stocks.head()
Out[8]:
In [9]:
stocks.plot(grid = True)
Out[9]:
In [10]:
stocks.plot(secondary_y = ["AAPL", "MSFT"], grid = True)
Out[10]:
In [11]:
stocks.plot(secondary_y = ["AAPL", "MSFT"], grid = True)
Out[11]:
In [12]:
# df.apply(arg) will apply the function arg to each column in df, and return a DataFrame with the result
# Recall that lambda x is an anonymous function accepting parameter x; in this case, x will be a pandas Series object
stock_return = stocks.apply(lambda x: x / x[0])
stock_return.head()
Out[12]:
In [13]:
stock_return.plot(grid = True).axhline(y = 1, color = "black", lw = 2)
Out[13]:
In [15]:
# Let's use NumPy's log function, though math's log function would work just as well
import numpy as np
stock_change = stocks.apply(lambda x: np.log(x) - np.log(x.shift(1))) # shift moves dates back by 1.
stock_change.head()
Out[15]:
In [16]:
stock_change.plot(grid = True).axhline(y = 0, color = "black", lw = 2)
Out[16]:
In [17]:
apple["20d"] = np.round(apple["Close"].rolling(window = 20, center = False).mean(), 2)
pandas_candlestick_ohlc(apple.loc['2016-01-04':'2016-08-07',:], otherseries = "20d")
In [18]:
apple["50d"] = np.round(apple["Close"].rolling(window = 50, center = False).mean(), 2)
apple["200d"] = np.round(apple["Close"].rolling(window = 200, center = False).mean(), 2)
pandas_candlestick_ohlc(apple.loc['2016-01-04':'2016-08-07',:], otherseries = ["20d", "50d", "200d"])
In [19]:
apple['20d-50d'] = apple['20d'] - apple['50d']
apple.tail()
Out[19]:
In [20]:
# np.where() is a vectorized if-else function, where a condition is checked for each component of a vector, and the first argument passed is used when the condition holds, and the other passed if it does not
apple["Regime"] = np.where(apple['20d-50d'] > 0, 1, 0)
# We have 1's for bullish regimes and 0's for everything else. Below I replace bearish regimes's values with -1, and to maintain the rest of the vector, the second argument is apple["Regime"]
apple["Regime"] = np.where(apple['20d-50d'] < 0, -1, apple["Regime"])
apple.loc['2016-01-01':'2016-08-07',"Regime"].plot(ylim = (-2,2)).axhline(y = 0, color = "black", lw = 2)
Out[20]:
In [21]:
apple["Regime"].plot(ylim = (-2,2)).axhline(y = 0, color = "black", lw = 2)
Out[21]:
In [22]:
apple["Regime"].value_counts()
Out[22]:
In [23]:
# To ensure that all trades close out, I temporarily change the regime of the last row to 0
regime_orig = apple.ix[-1, "Regime"]
apple.ix[-1, "Regime"] = 0
apple["Signal"] = np.sign(apple["Regime"] - apple["Regime"].shift(1))
# Restore original regime data
apple.ix[-1, "Regime"] = regime_orig
apple.tail()
Out[23]:
In [24]:
apple["Signal"].plot(ylim = (-2, 2))
Out[24]:
In [25]:
apple["Signal"].value_counts()
Out[25]:
In [26]:
apple.loc[apple["Signal"] == 1, "Close"]
Out[26]:
In [27]:
apple.loc[apple["Signal"] == -1, "Close"]
Out[27]:
In [28]:
# Create a DataFrame with trades, including the price at the trade and the regime under which the trade is made.
apple_signals = pd.concat([
pd.DataFrame({"Price": apple.loc[apple["Signal"] == 1, "Close"],
"Regime": apple.loc[apple["Signal"] == 1, "Regime"],
"Signal": "Buy"}),
pd.DataFrame({"Price": apple.loc[apple["Signal"] == -1, "Close"],
"Regime": apple.loc[apple["Signal"] == -1, "Regime"],
"Signal": "Sell"}),
])
apple_signals.sort_index(inplace = True)
apple_signals
Out[28]:
In [29]:
# Let's see the profitability of long trades
apple_long_profits = pd.DataFrame({
"Price": apple_signals.loc[(apple_signals["Signal"] == "Buy") &
apple_signals["Regime"] == 1, "Price"],
"Profit": pd.Series(apple_signals["Price"] - apple_signals["Price"].shift(1)).loc[
apple_signals.loc[(apple_signals["Signal"].shift(1) == "Buy") & (apple_signals["Regime"].shift(1) == 1)].index
].tolist(),
"End Date": apple_signals["Price"].loc[
apple_signals.loc[(apple_signals["Signal"].shift(1) == "Buy") & (apple_signals["Regime"].shift(1) == 1)].index
].index
})
apple_long_profits
Out[29]:
In [30]:
pandas_candlestick_ohlc(apple, stick = 45, otherseries = ["20d", "50d", "200d"])
In [31]:
def ohlc_adj(dat):
"""
:param dat: pandas DataFrame with stock data, including "Open", "High", "Low", "Close", and "Adj Close", with "Adj Close" containing adjusted closing prices
:return: pandas DataFrame with adjusted stock data
This function adjusts stock data for splits, dividends, etc., returning a data frame with
"Open", "High", "Low" and "Close" columns. The input DataFrame is similar to that returned
by pandas Yahoo! Finance API.
"""
return pd.DataFrame({"Open": dat["Open"] * dat["Adj Close"] / dat["Close"],
"High": dat["High"] * dat["Adj Close"] / dat["Close"],
"Low": dat["Low"] * dat["Adj Close"] / dat["Close"],
"Close": dat["Adj Close"]})
apple_adj = ohlc_adj(apple)
# This next code repeats all the earlier analysis we did on the adjusted data
apple_adj["20d"] = np.round(apple_adj["Close"].rolling(window = 20, center = False).mean(), 2)
apple_adj["50d"] = np.round(apple_adj["Close"].rolling(window = 50, center = False).mean(), 2)
apple_adj["200d"] = np.round(apple_adj["Close"].rolling(window = 200, center = False).mean(), 2)
apple_adj['20d-50d'] = apple_adj['20d'] - apple_adj['50d']
# np.where() is a vectorized if-else function, where a condition is checked for each component of a vector, and the first argument passed is used when the condition holds, and the other passed if it does not
apple_adj["Regime"] = np.where(apple_adj['20d-50d'] > 0, 1, 0)
# We have 1's for bullish regimes and 0's for everything else. Below I replace bearish regimes's values with -1, and to maintain the rest of the vector, the second argument is apple["Regime"]
apple_adj["Regime"] = np.where(apple_adj['20d-50d'] < 0, -1, apple_adj["Regime"])
# To ensure that all trades close out, I temporarily change the regime of the last row to 0
regime_orig = apple_adj.ix[-1, "Regime"]
apple_adj.ix[-1, "Regime"] = 0
apple_adj["Signal"] = np.sign(apple_adj["Regime"] - apple_adj["Regime"].shift(1))
# Restore original regime data
apple_adj.ix[-1, "Regime"] = regime_orig
# Create a DataFrame with trades, including the price at the trade and the regime under which the trade is made.
apple_adj_signals = pd.concat([
pd.DataFrame({"Price": apple_adj.loc[apple_adj["Signal"] == 1, "Close"],
"Regime": apple_adj.loc[apple_adj["Signal"] == 1, "Regime"],
"Signal": "Buy"}),
pd.DataFrame({"Price": apple_adj.loc[apple_adj["Signal"] == -1, "Close"],
"Regime": apple_adj.loc[apple_adj["Signal"] == -1, "Regime"],
"Signal": "Sell"}),
])
apple_adj_signals.sort_index(inplace = True)
apple_adj_long_profits = pd.DataFrame({
"Price": apple_adj_signals.loc[(apple_adj_signals["Signal"] == "Buy") &
apple_adj_signals["Regime"] == 1, "Price"],
"Profit": pd.Series(apple_adj_signals["Price"] - apple_adj_signals["Price"].shift(1)).loc[
apple_adj_signals.loc[(apple_adj_signals["Signal"].shift(1) == "Buy") & (apple_adj_signals["Regime"].shift(1) == 1)].index
].tolist(),
"End Date": apple_adj_signals["Price"].loc[
apple_adj_signals.loc[(apple_adj_signals["Signal"].shift(1) == "Buy") & (apple_adj_signals["Regime"].shift(1) == 1)].index
].index
})
pandas_candlestick_ohlc(apple_adj, stick = 45, otherseries = ["20d", "50d", "200d"])
In [32]:
apple_adj_long_profits
Out[32]:
In [33]:
# We need to get the low of the price during each trade.
tradeperiods = pd.DataFrame({"Start": apple_adj_long_profits.index,
"End": apple_adj_long_profits["End Date"]})
apple_adj_long_profits["Low"] = tradeperiods.apply(lambda x: min(apple_adj.loc[x["Start"]:x["End"], "Low"]), axis = 1)
apple_adj_long_profits
Out[33]:
In [34]:
# Now we have all the information needed to simulate this strategy in apple_adj_long_profits
cash = 1000000
apple_backtest = pd.DataFrame({"Start Port. Value": [],
"End Port. Value": [],
"End Date": [],
"Shares": [],
"Share Price": [],
"Trade Value": [],
"Profit per Share": [],
"Total Profit": [],
"Stop-Loss Triggered": []})
port_value = .1 # Max proportion of portfolio bet on any trade
batch = 100 # Number of shares bought per batch
stoploss = .2 # % of trade loss that would trigger a stoploss
for index, row in apple_adj_long_profits.iterrows():
batches = np.floor(cash * port_value) // np.ceil(batch * row["Price"]) # Maximum number of batches of stocks invested in
trade_val = batches * batch * row["Price"] # How much money is put on the line with each trade
if row["Low"] < (1 - stoploss) * row["Price"]: # Account for the stop-loss
share_profit = np.round((1 - stoploss) * row["Price"], 2)
stop_trig = True
else:
share_profit = row["Profit"]
stop_trig = False
profit = share_profit * batches * batch # Compute profits
# Add a row to the backtest data frame containing the results of the trade
apple_backtest = apple_backtest.append(pd.DataFrame({
"Start Port. Value": cash,
"End Port. Value": cash + profit,
"End Date": row["End Date"],
"Shares": batch * batches,
"Share Price": row["Price"],
"Trade Value": trade_val,
"Profit per Share": share_profit,
"Total Profit": profit,
"Stop-Loss Triggered": stop_trig
}, index = [index]))
cash = max(0, cash + profit)
apple_backtest
Out[34]:
In [35]:
apple_backtest["End Port. Value"].plot()
Out[35]:
In [37]:
def ma_crossover_orders(stocks, fast, slow):
"""
:param stocks: A list of tuples, the first argument in each tuple being a string containing the ticker symbol of each stock (or however you want the stock represented, so long as it's unique), and the second being a pandas DataFrame containing the stocks, with a "Close" column and indexing by date (like the data frames returned by the Yahoo! Finance API)
:param fast: Integer for the number of days used in the fast moving average
:param slow: Integer for the number of days used in the slow moving average
:return: pandas DataFrame containing stock orders
This function takes a list of stocks and determines when each stock would be bought or sold depending on a moving average crossover strategy, returning a data frame with information about when the stocks in the portfolio are bought or sold according to the strategy
"""
fast_str = str(fast) + 'd'
slow_str = str(slow) + 'd'
ma_diff_str = fast_str + '-' + slow_str
trades = pd.DataFrame({"Price": [], "Regime": [], "Signal": []})
for s in stocks:
# Get the moving averages, both fast and slow, along with the difference in the moving averages
s[1][fast_str] = np.round(s[1]["Close"].rolling(window = fast, center = False).mean(), 2)
s[1][slow_str] = np.round(s[1]["Close"].rolling(window = slow, center = False).mean(), 2)
s[1][ma_diff_str] = s[1][fast_str] - s[1][slow_str]
# np.where() is a vectorized if-else function, where a condition is checked for each component of a vector, and the first argument passed is used when the condition holds, and the other passed if it does not
s[1]["Regime"] = np.where(s[1][ma_diff_str] > 0, 1, 0)
# We have 1's for bullish regimes and 0's for everything else. Below I replace bearish regimes's values with -1, and to maintain the rest of the vector, the second argument is apple["Regime"]
s[1]["Regime"] = np.where(s[1][ma_diff_str] < 0, -1, s[1]["Regime"])
# To ensure that all trades close out, I temporarily change the regime of the last row to 0
regime_orig = s[1].ix[-1, "Regime"]
s[1].ix[-1, "Regime"] = 0
s[1]["Signal"] = np.sign(s[1]["Regime"] - s[1]["Regime"].shift(1))
# Restore original regime data
s[1].ix[-1, "Regime"] = regime_orig
# Get signals
signals = pd.concat([
pd.DataFrame({"Price": s[1].loc[s[1]["Signal"] == 1, "Close"],
"Regime": s[1].loc[s[1]["Signal"] == 1, "Regime"],
"Signal": "Buy"}),
pd.DataFrame({"Price": s[1].loc[s[1]["Signal"] == -1, "Close"],
"Regime": s[1].loc[s[1]["Signal"] == -1, "Regime"],
"Signal": "Sell"}),
])
signals.index = pd.MultiIndex.from_product([signals.index, [s[0]]], names = ["Date", "Symbol"])
trades = trades.append(signals)
trades.sort_index(inplace = True)
trades.index = pd.MultiIndex.from_tuples(trades.index, names = ["Date", "Symbol"])
return trades
def backtest(signals, cash, port_value = .1, batch = 100):
"""
:param signals: pandas DataFrame containing buy and sell signals with stock prices and symbols, like that returned by ma_crossover_orders
:param cash: integer for starting cash value
:param port_value: maximum proportion of portfolio to risk on any single trade
:param batch: Trading batch sizes
:return: pandas DataFrame with backtesting results
This function backtests strategies, with the signals generated by the strategies being passed in the signals DataFrame. A fictitious portfolio is simulated and the returns generated by this portfolio are reported.
"""
SYMBOL = 1 # Constant for which element in index represents symbol
portfolio = dict() # Will contain how many stocks are in the portfolio for a given symbol
port_prices = dict() # Tracks old trade prices for determining profits
# Dataframe that will contain backtesting report
results = pd.DataFrame({"Start Cash": [],
"End Cash": [],
"Portfolio Value": [],
"Type": [],
"Shares": [],
"Share Price": [],
"Trade Value": [],
"Profit per Share": [],
"Total Profit": []})
for index, row in signals.iterrows():
# These first few lines are done for any trade
shares = portfolio.setdefault(index[SYMBOL], 0)
trade_val = 0
batches = 0
cash_change = row["Price"] * shares # Shares could potentially be a positive or negative number (cash_change will be added in the end; negative shares indicate a short)
portfolio[index[SYMBOL]] = 0 # For a given symbol, a position is effectively cleared
old_price = port_prices.setdefault(index[SYMBOL], row["Price"])
portfolio_val = 0
for key, val in portfolio.items():
portfolio_val += val * port_prices[key]
if row["Signal"] == "Buy" and row["Regime"] == 1: # Entering a long position
batches = np.floor((portfolio_val + cash) * port_value) // np.ceil(batch * row["Price"]) # Maximum number of batches of stocks invested in
trade_val = batches * batch * row["Price"] # How much money is put on the line with each trade
cash_change -= trade_val # We are buying shares so cash will go down
portfolio[index[SYMBOL]] = batches * batch # Recording how many shares are currently invested in the stock
port_prices[index[SYMBOL]] = row["Price"] # Record price
old_price = row["Price"]
elif row["Signal"] == "Sell" and row["Regime"] == -1: # Entering a short
pass
# Do nothing; can we provide a method for shorting the market?
#else:
#raise ValueError("I don't know what to do with signal " + row["Signal"])
pprofit = row["Price"] - old_price # Compute profit per share; old_price is set in such a way that entering a position results in a profit of zero
# Update report
results = results.append(pd.DataFrame({
"Start Cash": cash,
"End Cash": cash + cash_change,
"Portfolio Value": cash + cash_change + portfolio_val + trade_val,
"Type": row["Signal"],
"Shares": batch * batches,
"Share Price": row["Price"],
"Trade Value": abs(cash_change),
"Profit per Share": pprofit,
"Total Profit": batches * batch * pprofit
}, index = [index]))
cash += cash_change # Final change to cash balance
results.sort_index(inplace = True)
results.index = pd.MultiIndex.from_tuples(results.index, names = ["Date", "Symbol"])
return results
# Get more stocks
microsoft = data.DataReader("MSFT", "yahoo", start, end)
google = data.DataReader("GOOG", "yahoo", start, end)
facebook = data.DataReader("FB", "yahoo", start, end)
twitter = data.DataReader("TWTR", "yahoo", start, end)
netflix = data.DataReader("NFLX", "yahoo", start, end)
amazon = data.DataReader("AMZN", "yahoo", start, end)
yahoo = data.DataReader("YHOO", "yahoo", start, end)
sony = data.DataReader("SNY", "yahoo", start, end)
nintendo = data.DataReader("NTDOY", "yahoo", start, end)
ibm = data.DataReader("IBM", "yahoo", start, end)
hp = data.DataReader("HPQ", "yahoo", start, end)
In [38]:
signals = ma_crossover_orders([("AAPL", ohlc_adj(apple)),
("MSFT", ohlc_adj(microsoft)),
("GOOG", ohlc_adj(google)),
("FB", ohlc_adj(facebook)),
("TWTR", ohlc_adj(twitter)),
("NFLX", ohlc_adj(netflix)),
("AMZN", ohlc_adj(amazon)),
("YHOO", ohlc_adj(yahoo)),
("SNY", ohlc_adj(yahoo)),
("NTDOY", ohlc_adj(nintendo)),
("IBM", ohlc_adj(ibm)),
("HPQ", ohlc_adj(hp))],
fast = 20, slow = 50)
signals
Out[38]:
In [39]:
bk = backtest(signals, 1000000)
bk
Out[39]:
In [40]:
bk["Portfolio Value"].groupby(level = 0).apply(lambda x: x[-1]).plot()
Out[40]:
In [42]:
spyder = data.DataReader("SPY", "yahoo", start, end)
spyder.iloc[[0,-1],:]
Out[42]:
In [43]:
batches = 1000000 // np.ceil(100 * spyder.ix[0,"Adj Close"]) # Maximum number of batches of stocks invested in
trade_val = batches * batch * spyder.ix[0,"Adj Close"] # How much money is used to buy SPY
final_val = batches * batch * spyder.ix[-1,"Adj Close"] + (1000000 - trade_val) # Final value of the portfolio
final_val
Out[43]:
In [44]:
# We see that the buy-and-hold strategy beats the strategy we developed earlier. I would also like to see a plot.
ax_bench = (spyder["Adj Close"] / spyder.ix[0, "Adj Close"]).plot(label = "SPY")
ax_bench = (bk["Portfolio Value"].groupby(level = 0).apply(lambda x: x[-1]) / 1000000).plot(ax = ax_bench, label = "Portfolio")
ax_bench.legend(ax_bench.get_lines(), [l.get_label() for l in ax_bench.get_lines()], loc = 'best')
ax_bench
Out[44]:
In [ ]: